The dataset we will work with here comes from http://www.tennis-data.co.uk/alldata.php and you can find a lot of information there! It covers all data related to tennis matches from 2000 to 2020, so it's a great dataset to train and practice your data science skills!
This notebook will cover:
import pandas as pd
from urllib.request import urlopen
import os.path as osp
import os
import logging
import zipfile
from glob import glob
# Show INFO-level messages from the root logger (download progress below).
logging.getLogger().setLevel('INFO')
def download_file(url_str, path):
    """Download the resource at ``url_str`` and write it to ``path``.

    FIX: the original left the HTTP response and output file unclosed if the
    transfer raised part-way; ``with`` guarantees both are released.
    """
    with urlopen(url_str) as response, open(path, 'wb') as output:
        output.write(response.read())
def extract_file(archive_path, target_dir):
    """Extract every member of the ZIP archive at ``archive_path`` into
    ``target_dir``.

    FIX: use a context manager so the archive handle is closed even if
    extraction raises (the original leaked it on error).
    """
    with zipfile.ZipFile(archive_path, 'r') as zip_file:
        zip_file.extractall(target_dir)
# Remote host serving the yearly result archives, and the local layout:
# tennis_data/ATP/archives and tennis_data/WTA/archives hold the raw zips.
BASE_URL = 'http://tennis-data.co.uk'
DATA_DIR = "tennis_data"
ATP_DIR = f'./{DATA_DIR}/ATP'
WTA_DIR = f'./{DATA_DIR}/WTA'
# One archive per season: ATP coverage starts in 2000, WTA in 2007.
ATP_URLS = [f"{BASE_URL}/{year}/{year}.zip" for year in range(2000, 2019)]
WTA_URLS = [f"{BASE_URL}/{year}w/{year}.zip" for year in range(2007, 2019)]
# Create the archive folders up front; exist_ok makes reruns a no-op.
os.makedirs(osp.join(ATP_DIR, 'archives'), exist_ok=True)
os.makedirs(osp.join(WTA_DIR, 'archives'), exist_ok=True)
# Download every season archive for both tours and unpack it next to the
# archives folder.  NOTE(review): this hits the network at import time; the
# prose above says data runs to 2020 but the URL ranges stop at 2018 — confirm.
for files, directory in ((ATP_URLS, ATP_DIR), (WTA_URLS, WTA_DIR)):
    for dl_path in files:
        logging.info("downloading & extracting file %s", dl_path)
        archive_path = osp.join(directory, 'archives', osp.basename(dl_path))
        download_file(dl_path, archive_path)
        extract_file(archive_path, directory)
# Gather the extracted Excel files (both .xls and .xlsx) for each tour and
# stack them into one DataFrame per tour; ignore_index yields a fresh 0..n index.
ATP_FILES = sorted(glob("%s/*.xls*" % ATP_DIR))
WTA_FILES = sorted(glob("%s/*.xls*" % WTA_DIR))
df_atp = pd.concat([pd.read_excel(f) for f in ATP_FILES], ignore_index=True)
df_wta = pd.concat([pd.read_excel(f) for f in WTA_FILES], ignore_index=True)
logging.info("%i matches ATP in df_atp", df_atp.shape[0])
logging.info("%i matches WTA in df_wta", df_wta.shape[0])
# TON_DIR = "" + /data_4q .csv
# df4 = pd.read_csv("TON_DIR", index_col=0, low_memory=False)
# Load the precomputed dataset (win-percentage features computed offline).
# FIX: catch only the missing-file error instead of a bare `except:`, which
# would also hide genuine parsing errors or a typo elsewhere.
try:
    df4 = pd.read_csv("../input/data-4q/data_4q .csv", index_col=0, low_memory=False)
except FileNotFoundError:
    print("\n\n\n\n\n\n\n\n\n\nIl semble que tu as oublié de faire l'upload du dataset!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
# Quick sanity checks: summary statistics for the ATP frame, and a peek at
# the engineered columns (from position 18 onward) of the uploaded dataset.
df_atp.describe()
df4.loc[:,df4.columns[18:]]
It could be interesting to calculate each player's winning percentage over their previous matches, for every match they play. To do so, we can run the cells right after. But as it takes some time (10~15 min), I'll leave it commented!
# def prior_wins_percentage(date, player, df, min_games=5):
# df_prior = df[df["Date"] < date]
# prior_wins = df_prior[df_prior["Winner"] == player].shape[0]
# prior_losses = df_prior[df_prior["Loser"] == player].shape[0]
# # We set a minimum number of games to avoid extra-high win rates
# # (e.g. like at a professional career debut)
# if (prior_wins + prior_losses) < min_games:
# return 0
# return prior_wins / (prior_wins + prior_losses)
# df_atp["winner_past_vict"] = df_atp.apply(lambda x: prior_wins_percentage(x["Date"],x["Winner"],df_atp),axis=1)
# df_atp["loser_past_vict"] = df_atp.apply(lambda x: prior_wins_percentage(x["Date"],x["Loser"],df_atp),axis=1)
#data.to_csv("data_4q.csv")
Checking (uncomment la cellule suivante)
#df_atp.tail()
To predict who will win a match, we define 3 main functions:
import lightgbm as lgb
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
# Module-level registries keyed by model number; the evaluation helpers fill
# these so the models can be compared side by side at the end:
#   conf -> confusion-matrix rates, acu -> LightGBM training curves,
#   roc  -> ROC curve points, y_predictions -> thresholded predicted classes.
conf, acu, roc, y_predictions = {}, {}, {}, {}
def get_data_splits(dataframe, valid_fraction=0.1):
    """Split ``dataframe`` chronologically into train/valid/test parts.

    The last two ``valid_fraction``-sized slices become the validation and
    test sets (equal sizes); everything before them is the training set.

    BUG FIX: the original re-assigned ``valid_fraction = 0.1`` inside the
    body, silently ignoring any value the caller passed.
    """
    valid_size = int(len(dataframe) * valid_fraction)
    train = dataframe[:-valid_size * 2]
    # valid size == test size, last two sections of the data
    valid = dataframe[-valid_size * 2:-valid_size]
    test = dataframe[-valid_size:]
    print(f"Train size : {len(train)}\nValidation size : {len(valid)}\nTest size : {len(test)}")
    return train, valid, test
def train_model(train, valid,test,over,n):
    """Train a LightGBM binary classifier, save it, and run the evaluation.

    Parameters:
        train, valid, test: DataFrames containing a 'Labels' column.
        over: if True, use a shallower, L1/L2-regularised model (anti-overfit).
        n: model number, used for the saved filename and the result registries.

    NOTE(review): the ``test`` parameter is unused here — ``evaluate`` reads
    the module-level ``valid``/``test`` globals instead; confirm call order.
    """
    feature_cols = train.columns.drop('Labels')
    dtrain = lgb.Dataset(train[feature_cols], label=train['Labels'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['Labels'])
    if over:
        # Shallow trees plus L1/L2 penalties to damp overfitting.
        param = {'num_leaves': 31, 'objective': 'binary', "max_depth": 3,
                 'metric': 'auc', 'seed': 7, 'reg_alpha':0.5, 'reg_lambda':0.5}
        print(f"Regularization : l1 = {param['reg_alpha']}, l2 = {param['reg_lambda']}")
    else:
        param = {'num_leaves': 64, 'objective': 'binary',
                 'metric': 'auc', 'seed': 7}
        print(f"No regularization!")
    evals_result = {}
    # Early-stop on validation AUC; per-iteration curves land in evals_result.
    bst = lgb.train(param, dtrain, num_boost_round=1000, valid_sets=[dvalid,dtrain],
                    early_stopping_rounds=10, verbose_eval=10, evals_result=evals_result)
    nameModel = "Model " + str(n) +".txt"
    import joblib
    # save model
    joblib.dump(bst, nameModel)
    evaluate(bst,feature_cols,evals_result,n)
def evaluate(bst,feature_cols,evals_result,n):
    """Report validation/test AUC for ``bst`` and hand off to ``plot``.

    NOTE(review): ``valid`` and ``test`` are read from module-level globals
    (set by get_data_splits), not passed as parameters — confirm they match
    the data the model was trained on.
    """
    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['Labels'], valid_pred)
    test_pred = bst.predict(test[feature_cols])
    test_score = metrics.roc_auc_score(test['Labels'], test_pred)
    print(f"Validation AUC score: {valid_score:.4f}")
    print(f"Test AUC score: {test_score:.4f}")
    plot(evals_result,valid_pred,bst,feature_cols,test[feature_cols],n)
def plot(evals_result,valid_pred,bst,feature_cols,test,n):
    """Render diagnostics for model ``n``: the LightGBM training-AUC curve
    (when ``evals_result`` is given), a confusion matrix at a 0.5 threshold,
    and the ROC curve; results are stashed in the module-level registries.

    NOTE(review): reads the module-level ``valid`` global for the true
    labels.  ``global auc`` below shadows nothing useful — ``auc`` is the
    sklearn.metrics function and is only read here, never reassigned.
    """
    global conf
    global auc
    global roc
    global y_predictions
    if evals_result != None:
        # The Keras path passes evals_result=None and skips this curve.
        acu[n] = evals_result
        fig1 = plt.figure(figsize=(45,10))
        #print('Plot metrics during training... Our metric : ', param["metric"])
        #print("evals_ results : ", evals_result)
        lgb.plot_metric(evals_result, metric='auc',figsize=(35,10))
        plt.xlabel('Iterations',fontsize=20)
        plt.ylabel('auc',fontsize=20)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.title("AUC during training",fontsize=20)
        plt.legend(fontsize=20)
        plt.show()
    ##### CONFUSION MATRIX
    # Binarise the predicted probabilities at a fixed 0.5 threshold.
    th = 0.5
    y_pred_class = valid_pred > th
    y_predictions[n] = y_pred_class
    cm = confusion_matrix(valid["Labels"], y_pred_class)
    tn, fp, fn, tp = cm.ravel()
    # Per-class rates derived from the confusion matrix counts.
    fpr = fp / (fp + tn)
    fnr = fn / (tp + fn)
    tnr = tn / (tn + fp)
    tpr = tp / (tp + fn)
    numberModel = n
    conf[n] = {'fpr':f'{fpr:.3f}','fnr': f'{fnr:.3f}', 'tnr' : f'{tnr:.3f}', "tpr": f'{tpr:.3f}'}
    # Ratio of this model's rates to the previous model's, for comparison.
    if n > 1 and fpr != 0 and fnr != 0 and tnr != 0 and tpr != 0:
        conf["ratio " + str(n) + "/" + str(n-1)] = {"fp":f'{float(conf[n]["fpr"])/float(conf[n-1]["fpr"]):.3f}', \
            "fn":f'{float(conf[n]["fnr"])/float(conf[n-1]["fnr"]):.3f}', \
            "tn":f'{float(conf[n]["tnr"])/float(conf[n-1]["tnr"]):.3f}', \
            "tp":f'{float(conf[n]["tpr"])/float(conf[n-1]["tpr"]):.3f}'}
    fig2 = plt.figure(figsize=(35,10))
    fig2.add_subplot(1,2,1)
    sns.heatmap(cm, annot = True, fmt='d', cmap="Blues", vmin = 0.2,linewidths=.5,annot_kws={"fontsize": 20}); #cbar_kws={"fontsize": 20},annot_kws={"fontsize": 20}
    sns.set(font_scale=2)
    plt.title('Confusion Matrix',fontsize=20)
    plt.ylabel('True Class',fontsize=20)
    plt.xlabel('Predicted Class',fontsize=20)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.text(0.1, 0.3, f' FPR: {fpr:.3f}\n FNR: {fnr:.3f}\n TNR: {tnr:.3f}\n TPR: {tpr:.3f}', style='italic',
        bbox={'facecolor': 'white', 'alpha': 0.7, 'pad': 5}, fontsize=14)
    #Print Area Under Curve
    fig2.add_subplot(1,2,2)
    false_positive_rate, recall, thresholds = roc_curve(valid["Labels"], valid_pred)
    roc_auc = auc(false_positive_rate, recall)
    roc[n] = {'fpr':false_positive_rate,'recall':recall}
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.plot(false_positive_rate, recall, 'b', label = 'AUC = %0.3f' %roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0,1], [0,1], 'r--')
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.0])
    plt.ylabel('Recall',fontsize=20)
    plt.xlabel('Fall-out (1-Specificity)',fontsize=20)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()
    display(conf)
def standard(df,feat):
    """Standardise (zero mean, unit variance) the ``feat`` columns of ``df``
    in place; the scaler is fit on every column except 'Labels'.

    BUG FIX: the original wrapped the scaled array in a new DataFrame with a
    default 0..n-1 index before assigning; on a frame whose index is not the
    default, pandas alignment silently produced NaNs.  Assigning the raw
    numpy array avoids the index mismatch.
    """
    scaler = StandardScaler()
    df[[*feat]] = scaler.fit_transform(df.drop(["Labels"], axis=1))
    #df[[*feat]] = pd.DataFrame(scaler.fit_transform(df),columns=feat)
    return df
df_atp.drop("Comment", axis=1, inplace=True) #### Lets get rid off ths columns because it doesn't seem to useful for us...
# df_atp.columns
Here is where we're gonna create the base from where our model will try to understand what is going on to predict if certain row with all the players variables results in "win" or "lose" (1 or 0). But let's see what happens swapping only the name's variables (that is to say, we're not going to change whatever the other variables are... "WRank", "LPts" etc...). You can imagine that this will kind of mess with the learning process as the machine will struggle to understand to which player the variables belong since we are now swapping the names in only some cases.
df_atp["Labels"] = df_atp.apply(lambda row: 1 if row["Winner"] < row["Loser"] else 0, axis=1)
df_atp["Player1"] = df_atp.apply(lambda row: row["Winner"] if row["Winner"] < row["Loser"] else row["Loser"], axis=1)
df_atp["Oponent"] = df_atp.apply(lambda row: row["Loser"] if row["Winner"] < row["Loser"] else row["Winner"], axis=1)
display(df_atp[["Winner", "Loser", "Labels","Player1","Oponent"]].head(5))
We can see that this method doesn't leave our dataset unbalanced:
# Class-balance check: counts of label 1 vs label 0.
print(df_atp[df_atp["Labels"] == 1].shape[0])
print(df_atp[df_atp["Labels"] == 0].shape[0])
Now let's check the possible errors we may have hidden somewhere in the middle of our dataset!! To do so, we'll check the columns that are represented by "dtype=object"!!
# Columns still typed as object — candidates for cleaning or encoding.
cols = df_atp.columns[df_atp.dtypes.eq(object)]
print(cols)
Nice!! But let's keep the first 6 for now, as I want to create categorical features from them for the 2nd model!
#df_atp.head()
cols.drop(["Location","Tournament", "Series","Court","Surface","Player1","Oponent","Round"])
display(df_atp.isnull().sum())
cols = df_atp.columns[11:].drop(["Labels","Player1","Oponent"])
cols
So lets grab all the columns with errors and apply a function to coerce the errors as "NaN" values. Like that, we can get rid of multiple types of errors like "30,,2" a value that has nothing than a string space " " where it should have a number, for instance. They will all be transformed to "NaN". Then it will be straight forward to manipulate the Nan!
# Coerce the messy object columns to numbers; unparseable entries become NaN.
df_atp.loc[:, cols] = df_atp.loc[:, cols].apply(pd.to_numeric, errors='coerce')
You can do whatever you want with the NaN (replace them with the mean value of each column, the max, min etc...) For a matter of simplicity, I'll replace all of them with the maximum value found on that column:
# Drop columns that are entirely NaN, then fill remaining NaNs with each
# column's maximum (a deliberate, simple imputation choice — see prose above).
# NOTE(review): if dropna removed a column still listed in `cols`, the loop
# would raise KeyError — confirm `cols` only holds surviving columns.
df_atp = df_atp.dropna(axis=1, how="all")
for each in cols:
    df_atp[each] = df_atp[each].fillna(df_atp[each].max())
df_atp.head()
For a matter of checking:
# Spot-check the imputation: max value of the last few coerced columns.
for each in cols[-5:]:
    print(f"max value for {each} : {df_atp[each].max()}")
As we'll have 4 models, I'll define a function to drop chosen columns:
#dt = df4.copy()
# Work on a copy so the cleaned df_atp stays intact for the later models.
dt = df_atp.copy()
def drop_cols(df): ### Defined as a helper because this pruning is repeated for several models.
    """Remove, in place, the first 8 (metadata) columns plus the name columns."""
    unwanted = list(df.columns[:8]) + ["Winner", "Loser", "Player1", "Oponent"]
    df.drop(unwanted, axis=1, inplace=True)
#dt.tail()
# Apply the column pruning to the model-1 working copy.
drop_cols(dt)
# dt.head()
# dt.columns
Standardizer!
st_feat = dt.columns[:].drop("Labels") ## the columns to standardise (everything except the target)
st_feat
# Standardise the features in place (see standard() above).
dt = standard(dt,st_feat)
dt.head()
Mettons enfin le dataset pour entrainer et obtenir le Score AUC pour la validation:
dt.to_csv("data_model1.csv")
train,valid, test = get_data_splits(dt)
over = False ### Parameter telling if we are overfitting or not!
train_model(train,valid,test,over,1)
Surprised with the results? Quite a poor performance, isn't? Lets try to understand it.
So let's now try to help our model by creating other types of variables. Let's increase the complexity of our model. We can expect a better performance.
Lets see now if creating categorical features with LabelEncoder will help our model understand the data better.
# Start model 2 from the cleaned ATP frame.  FIX: the original first executed
# `dt = df4.copy()` and immediately overwrote the result — dead code that
# crashes when df4 was never uploaded, so it is commented out like at model 1.
#dt = df4.copy()
dt = df_atp.copy()
dt.head()
But first, you may ask: why categorize these features? In what they could help the algorithm better predict the outcome of a match? At a first glance, we could not see how a variable that tells the match city could influence the algorithm's outcome...
Pourquoi catégorizer ces variables?? En quoi pourraient elles aider l'algorithime à meilleur prévoir le résultat? Bon, à un premier coup d'oeil, nous ne pensons pas qu'une variable qui informe le lieu du matche pourrait avoir une influence sur le résultat.
Mais que diriez vous de votre performance a La Javaness si vous deviez travailler dans une petit chambre 3x4 a la chaleur de 36% et une humidité de 40%? Auriez-vous la même performance? (peut-être un brésilien oui XD )
Dans le même sense, sauriez-vous capable de trouver les bonnes qualités et la valeur ajoutée de ce script ici (oui, je parle bien de tout ce script écrit par un candidat au poste de DS a La Javaness) si vous l'examineriez en Décembre (ne seriez vous tenté aux vacances ou bien au bon repas de Noël? =) La Date peut quand même être important non seulement au niveau personelle de chaque jouer, mais aussi au niveau de sa performance professionelle). Les mêmes considérations peuvent être faites pour les autres variables.
Nous n'affirmons pas que ces informations ci-dessus soient éffectivement présentes dans le dataset. Elles devraient cependant être considerées comme implicites et faisant partie d'une hipothèse plutôt qu'une assertion.
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# Integer-encode the categorical columns into new "<name>_cat" features.
# FIX: removed the stray bare `LabelEncoder()` statement (a no-op whose
# result was discarded) and collapsed fit + transform into fit_transform —
# same encodings, one pass.
cat = ["Location", "Tournament", "Date", "Series", "Court", "Surface", "Round", "Player1", "Oponent"]
for each in cat:
    le = preprocessing.LabelEncoder()
    name = each + "_cat"
    dt[name] = le.fit_transform(dt[each])
dt.head()
# Prune metadata/name columns, standardise, persist, and train model 2 with
# the same (unregularised) settings as model 1.
drop_cols(dt)
#dt.head()
st_feat = dt.columns.drop("Labels")
print(st_feat)
dt = standard(dt,st_feat)
dt.head()
dt.to_csv("data_model2.csv")
train,valid, test = get_data_splits(dt)
train_model(train,valid,test,over,2)
So we increased our validation score. As we expected, right? If we compare the values from the Confusion Matrix we can learn that:
We can conclude that this model were more sensible to the negative cases. He undestood better that positive cases were not negative (lower FPR) and negative cases were indeed negative cases (higher TNR)
However, we still have a very poor test score!! Can you tell me why? Well, since the test set is also data never seen by our model, it's quite normal that the model misses the prediction if it is asserting from bad information (in our case the messy, not-swapped columns). Moreover, if you take into consideration that some players only appear in the test data (check this visualization from Facets Overview in my previous notebook: https://www.kaggle.com/danielfmfurlan/eda-tennis), we will enhance this comprehension: bad learning from the training leads to bad predictions for unseen data!
Lets give another chance to our model and try to catch only the most important features (lets see if the RandomForest can see that a lot of the informations going on in our model are indeed messing with the learning and thus should be discarded)
If you want to visualy get a insight about the correlation between the features, uncomment the next cell to see a HeatMap:
# cor = dt.corr()
# import matplotlib.pyplot as plt
# import seaborn as sns; sns.set()
# plt.figure(figsize=(28,20))
# import numpy as np
# mask = np.zeros_like(cor)
# mask[np.triu_indices_from(mask)] = True
# with sns.axes_style("white"):
# f, ax = plt.subplots(figsize=(28, 20))
# ax = sns.heatmap(cor, center = 0, linewidth = 0.9, vmin = -1, vmax = 1,
# cmap = sns.color_palette("RdBu_r", 7),annot = False, mask=mask, square=True, fmt='.g')
# corr_m = cor.abs()
# sol = (corr_m.where(np.triu(np.ones(corr_m.shape), k=1).astype(np.bool))
# .stack()
# .sort_values(ascending=False))
# print("les 10 variables qui ont une plus forte correlation : \n",sol[:10])
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
# Fit a random forest purely to rank feature importances for model 3.
X = dt.drop("Labels", axis=1)
display(X.columns)
y=dt.loc[:, "Labels"]
feats = RandomForestClassifier(n_jobs=-1)
feats.fit(X, y)
plt.figure(figsize=(10, 10))
imp = feats.feature_importances_
cols = dt.columns.drop("Labels")
# Sort features by importance (ascending) for the horizontal bar chart.
imp, cols = zip(*sorted(zip(imp, cols)))
plt.barh(range(len(cols)), imp, align="center", color='blue');
plt.yticks(range(len(cols)), cols,fontsize=10)
plt.xticks(fontsize=10)
plt.title("Variables importants pour la Classification")
plt.xlabel("Relevance (%)",fontsize=12)
plt.tight_layout();
import numpy as np
# Keep only features whose importance clears the threshold; because `imp` is
# sorted ascending, the qualifying features are the tail of `cols`.
th = 0.025
imp = np.array(imp)
most = cols[- np.where(imp > th)[0].shape[0] :]
print(f'les variables plus importantes qui influencent plus le résultat :\n {most}')
So now we are taking 50% good information and 50% messy information (5 variables come from the Categorization and thus, are swapping-column-prof). We expect a better performance!
# Restrict the dataset to the selected features (plus the target) and train
# model 3 on the reduced feature set.
dt = dt[[*(most + ("Labels",))]]
dt.head(5)
dt.to_csv("data_model3.csv")
train,valid, test = get_data_splits(dt)
train_model(train,valid,test,over,3)
The validation score increased only a little. We can assess this variation by analysing the ROC curve or the confusion matrix. The better performance came from the lower fnr and the higher tpr (ratio: 1.032). That is to say: our model was better at understanding that some positive cases were indeed positive (labels 1 predicted as 1!) and that some positive cases were NOT negative (labels 1 were not predicted as 0). It may seem a bit obvious said like that, but it is just a matter of discretizing the data into categories. And from that information we can start thinking, for instance, about whether there is any difference in the weights of our model when a sample has label 1 or label 0.
Now, let's see what happen if we swap the columns correctly (accordingly to the alphabetic order that we got from the "Winner" and "Loser" columns). We will also have a parameter called "noise" that is to preserve some columns without swapping them (so maintaining a noise to our model, as you'll see that he'll easily overfit)
# Fresh copy for model 4 — the correctly-swapped-columns experiment.
dt = df_atp.copy()
dt.head()
dt.columns
def noisy(noise, d):
    """Select the winner/loser feature-column pairs to swap for model 4.

    Parameters:
        noise: fraction (0..1) of the feature pairs to leave UN-swapped,
            keeping some noise so the model does not overfit instantly.
        d: 'l' to take the swapped features from the left (start) of the
            list, 'r' from the right (end); WRank/LRank sit on the left.

    Returns a (winner_features, loser_features) pair of equal-length lists.

    FIXES: the original raised UnboundLocalError for any ``d`` other than
    'l'/'r', and with ``ft == 0`` the ``[-0:]`` slice on the 'r' branch
    returned the FULL list instead of an empty one.
    """
    feat_w_all = ["WRank","W1","W2","W3","W4","W5","Wsets","CBW","GBW","IWW","SBW","B365W","B&WW","EXW","PSW","WPts","UBW","LBW","SJW","MaxW","AvgW"]
    feat_l_all = ["LRank","L1","L2","L3","L4","L5","Lsets","CBL","GBL","IWL","SBL","B365L","B&WL","EXL","PSL","LPts","UBL","LBL","SJL","MaxL","AvgL"]
    per = float(1 - noise)
    #display(feat_w)
    length = len(feat_w_all)
    #display((length))
    ft = int(length*per)  # number of pairs that will actually be swapped
    #display(ft)
    if d == "l":
        feat_w = feat_w_all[:ft]
        feat_l = feat_l_all[:ft]
    elif d == "r":
        feat_w = feat_w_all[-ft:] if ft else []
        feat_l = feat_l_all[-ft:] if ft else []
    else:
        raise ValueError(f"d must be 'l' or 'r', got {d!r}")
    print("you are choosing this features to swap : \n",feat_w,"\n",feat_l)
    #return feat_w_all[:ft],feat_l_all[:ft]
    return feat_w, feat_l
def swap(serW,serL):
    """For each winner/loser column pair, create 'winner_<col>'/'loser_<col>'
    columns whose values follow the alphabetical Player1/Oponent ordering
    instead of the winner/loser ordering.

    NOTE(review): mutates the module-level ``dt`` DataFrame directly; serW
    and serL must be index-aligned Series of matching column names.
    """
    for idx,each in serW.items():
        #print(each,idx)
        loserItem = serL[idx]
        name = "winner_" + each
        #print("loser item : ", serL[idx])
        # Keep the winner's value when the winner IS the alphabetically-first
        # player; otherwise take the loser's value for that paired column.
        dt[name] = dt.apply(lambda row: row[each] if row["Winner"] < row["Loser"] else row[loserItem], axis=1)
    for idx,each in serL.items():
        #print(each,idx)
        winnerItem = serW[idx]
        #print("loser item : ", serL[idx])
        name = "loser_" + each
        dt[name] = dt.apply(lambda row: row[each] if row["Winner"] < row["Loser"] else row[winnerItem], axis=1)
######################" Define the quantity of noise to add to our model in order to not overfit. Put it in decimal number
noise = 0.8
side = "r" ####### get the features from left "l" or right "r". This is because the WRank and LRank are really important features to our model.
feat_w,feat_l = noisy(noise,"l")
serW = pd.Series(feat_w)
serL = pd.Series(feat_l)
swap(serW,serL)
###################################### To create dataset for What-if-Tool
############### Need first to get the df_atp with "Player1" & "Oponent" and get rid of NaN values!
# feat_w = ["WRank"]
# feat_l = ["LRank"]
# serW = pd.Series(feat_w)
# serL = pd.Series(feat_l)
# swap(serW,serL)
# display(dt.columns)
# dt.drop(['W1', 'L1','WRank','LRank',
# 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets', 'CBW',
# 'CBL', 'GBW', 'GBL', 'IWW', 'IWL', 'SBW', 'SBL', 'B365W', 'B365L',
# 'B&WW', 'B&WL', 'EXW', 'EXL', 'PSW', 'PSL','UBW',
# 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW', 'MaxL', 'AvgW', 'AvgL'],axis=1,inplace=True)
# dt.drop(["Winner","Loser"],axis=1,inplace=True)
# display(dt.columns)
# dt.to_csv("data_model4_wit_swap.csv")
def drop_cols(df):
    """Model-4 redefinition (shadows the earlier drop_cols): also drops
    column 9 and the original un-swapped feature pairs chosen by noisy().

    NOTE(review): depends on the module-level feat_w / feat_l lists.
    """
    drop_col = [x for x in df.columns[0:9]]+["Winner",'Loser',"Player1","Oponent"] + feat_w + feat_l
    df.drop(drop_col,axis=1,inplace=True)
# Apply the pruning to the model-4 working frame.
drop_cols(dt)
# dt.to_csv("data_model4_no_std.csv")
display(dt.head())
# Keep unscaled copies of the swapped rank columns for the 3-D plot later.
win_Rank = dt["winner_WRank"].copy()
los_Rank = dt["loser_LRank"].copy()
st_feat = dt.columns.drop("Labels")
print(st_feat)
dt = standard(dt,st_feat)
dt.head()
dt.to_csv("data_model4.csv")
train,valid, test = get_data_splits(dt)
print("len of train : ", len(train))
# Model 4 overfits easily, so train with the regularised settings.
over = True
train_model(train,valid,test,over,4)
You can see from the results that this model overfitted even creating noise and penalizing it with L1=L2=0.5!!!
This shows how LightGBM can fastly converge to the minimum loss of our function. If you want to avoid this overfitting, you can shuffle the data and then you'll see that the performance will fall!!
##### ALL ROC CURVES FROM 4 MODELS
# Overlay the ROC curves of every model stored in the `roc` registry.
# NOTE(review): colors[idx-1] assumes the registry keys are 1..4.
Ax = plt.figure(figsize=(35,10))
colors = ['b','r','g','y']
for idx in roc:
    recall = roc[idx]['recall']
    false_positive_rate = roc[idx]['fpr']
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.plot(false_positive_rate, recall, color=colors[idx-1], label = f"Model {idx}")
    plt.legend(loc='lower right')
# Chance diagonal plus shared axis styling.
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.ylabel('Recall',fontsize=20)
plt.xlabel('Fall-out (1-Specificity)',fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()
If you want to get only some features and so to play with the RandomForest Classifier, go ahead and uncomment the next cell and choose some threshold! (here is 0.025)
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
# xs = {"lab":"TOEFL Score_admitVar", 1:train["TOEFL Score_admitVar"]}
# zs = {"lab":"TOEFL Score",1:train["TOEFL Score"]}
# ys = {"lab":"Chance of Admit",1:train["Chance of Admit "]}
# 3-D view of model 4's validation predictions against the unscaled ranks.
y_pred = y_predictions[4]
# NOTE(review): -5229 is presumably the validation-set length so the rank
# series align with `valid` — confirm it equals len(valid).
fig = px.scatter_3d(valid, x=win_Rank[-5229:], y=los_Rank[-5229:], z=valid["Labels"],
    color=y_pred)
fig.show()
From the above graph we can see that, despite the almost flawless aspect of our model, the errors committed have one thing in common: the LRank and WRank were mostly on the same plateau (that is to say, it is mostly improbable to miss the prediction if the difference between WRank and LRank is too big). But it is interesting to observe that the False Negatives (the blue dots among the red ones) came mostly from a narrower delta of WRank than of LRank (you can see that the blue dots are concentrated between 23-56), whereas the False Positives (the red dots among the blue dots) are spread over a bigger delta of both LRank AND WRank.
# X = dt.drop("Labels", axis=1)
# display(X.columns)
# y=dt.loc[:, "Labels"]
# feats = RandomForestClassifier(n_jobs=-1)
# feats.fit(X, y)
# plt.figure(figsize=(10, 10))
# imp = feats.feature_importances_
# cols = dt.columns.drop("Labels")
# imp, cols = zip(*sorted(zip(imp, cols)))
# plt.barh(range(len(cols)), imp, align="center", color='blue');
# plt.yticks(range(len(cols)), cols)
# plt.title("Variables importants pour la Classification")
# plt.xlabel("Relevance (%)")
# plt.tight_layout();
# import numpy as np
# th = 0.025 ################## THRESHOLD!!! Change if you want!
# imp = np.array(imp)
# most = cols[- np.where(imp > th)[0].shape[0] :]
# print(f'les variables plus importantes qui influencent plus le résultat :\n {most}')
# dt = dt[[*(most + ("Labels",))]]
# display(dt.head(5))
# train,valid, test = get_data_splits(dt)
# train_model(train,valid,test,over,5)
import pandas
from keras.models import Sequential
from keras.layers import Dense
from tensorflow import keras
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Re-split the current dataset and train a small Keras MLP as model 5.
train,valid, test = get_data_splits(dt)
X = train.drop("Labels",axis=1) #[:,0:60].astype(float)
Y = train["Labels"]
# define model: two hidden ReLU layers, sigmoid output for the binary labels.
model = Sequential()
model.add(Dense(11, activation='relu'))
model.add(Dense(60,activation='relu'))#input_dim=60
model.add(Dense(1, activation='sigmoid'))
# compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
history = model.fit(X, Y, epochs=10, batch_size=32, verbose=1)
# evaluate the model — on the TRAINING data, so not a generalisation measure.
scores = model.evaluate(X, Y, verbose=1)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
# save model and architecture to single file
model.save("model.h5")
print("Saved model to disk")
#### Get validation scores
# Predicted win probabilities on the validation set.
X_valid = valid.drop("Labels",axis=1)
from sklearn.preprocessing import MinMaxScaler
y_pred = model.predict_proba(X_valid)
print("lenght ynew : ", len(y_pred))
# print("X=%s, Predicted=%s" % (Xnew.iloc[0], ynew[0]))
#### Plot Accuracy graph:
# Plot the Keras training history.  FIX: the axis label and title said
# "AUC", but history.history['accuracy'] is the accuracy metric configured
# in model.compile — the labels now match what is plotted.
print(history.history.keys())
fig = plt.figure(figsize=(35,10))
# history.history['accuracy']
plt.plot(history.history['accuracy'], color='blue', label='train')
plt.xlabel('Epochs',fontsize=20)
plt.ylabel('accuracy',fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title("Accuracy during training",fontsize=20)
plt.legend(fontsize=20)
plt.show()
### LOAD KERAS MODEL:
# from keras.models import load_model
# # load model
# model = load_model('model.h5')
# Reuse the shared diagnostics (confusion matrix + ROC) for the Keras model;
# evals_result=None skips the LightGBM-specific training curve.
plot(None,y_pred,None,None,test,5)
#### Accuracy curve for 3 models
# # acu[n] = evals_result
# ax = plt.figure(figsize=(45,10))
# #print('Plot metrics during training... Our metric : ', param["metric"])
# #print("evals_ results : ", evals_result)
# for idx,each in acu.items():
# #print(acu[idx])
# lgb.plot_metric(acu[idx], metric='auc',figsize=(35,10))
# plt.xlabel('Iterations',fontsize=20)
# plt.ylabel('auc',fontsize=20)
# plt.xticks(fontsize=20)
# plt.yticks(fontsize=20)
# plt.title("AUC during training",fontsize=20)
# plt.legend(fontsize=20)
# plt.show()
# import numpy as np
# df_atp.WRank = pd.to_numeric(df_atp.WRank, errors = 'coerce')
# df_atp.LRank = pd.to_numeric(df_atp.LRank, errors = 'coerce')
# # New Feature: Rank difference betweehn the 2 oponents
# df_atp['Diff'] = df_atp.LRank - df_atp.WRank
# # New Feature: Round the rank difference to 10's and 20's
# df_atp['Round_10'] = 10*round(np.true_divide(df_atp.Diff,10))
# df_atp['Round_20'] = 20*round(np.true_divide(df_atp.Diff,20))
# # New Feature: Total number of sets in the match
# df_atp['Total Sets'] = df_atp.Wsets + df_atp.Lsets
# df_atp['Sets Diff'] = df_atp.W1+df_atp.W2+df_atp.W3+df_atp.W4+df_atp.W5 - (df_atp.L1+df_atp.L2+df_atp.L3+df_atp.L4+df_atp.L5)
# new_df = df_atp
# # 2 New Data Frames: Grand Slam data frame (GS) and non-Grand Slam data frame (non GS)
# df_non_GS = new_df[~(new_df.Series == 'Grand Slam')]
# df_GS = new_df[new_df.Series == 'Grand Slam']
# #%% Winning probability vs Rank Difference
# plt.figure(figsize = (10,10))
# bins = np.arange(10,200,10)
# Gs_prob = []
# non_Gs_prob = []
# for value in bins:
# pos = value
# neg = -value
# pos_wins = len(df_GS[df_GS.Round_10 == pos])
# neg_wins = len(df_GS[df_GS.Round_10 == neg])
# Gs_prob.append(np.true_divide(pos_wins,pos_wins + neg_wins))
# pos_wins = len(df_non_GS[df_non_GS.Round_10 == pos])
# neg_wins = len(df_non_GS[df_non_GS.Round_10 == neg])
# non_Gs_prob.append(np.true_divide(pos_wins,pos_wins + neg_wins))
# plt.bar(bins,Gs_prob,width = 9, color = 'black')
# plt.bar(bins,non_Gs_prob,width = 8, color = 'grey')
# plt.title('Probabilité vs Difference de Rank', fontsize = 30)
# plt.xlabel('Difference de Rank',fontsize = 15)
# plt.ylabel('Probabilité de Gagner',fontsize = 15)
# plt.xlim([10,200])
# plt.ylim([0.5,0.9])
# plt.legend(['grand slams', 'Non grand slams'], loc = 1, fontsize = 15)
# plt.show()
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
# 3-D scatter: ranks vs the winner's past win rate, coloured by the loser's.
fig = px.scatter_3d(df4, x=df4["WRank"], y=df4["LRank"], z=df4["winner_past_vict"],
    color=df4["loser_past_vict"])
fig.show()
df_atp["Surface"].value_counts()
y = df_atp["Surface"].unique()
plt.figure(figsize=(15, 10))
#plt.subplot(131)
plt.ylabel("Surfaces")
e = plt.bar(y,df_atp["Surface"].value_counts())#color=['black', 'red', 'green', 'blue', 'cyan','orange','gray'])
#e[0].set_color('r')
#e[4].set_color("g")
plt.xticks()
#plt.ylim(0.0,0.003)
plt.tick_params(axis='x', colors='red')
plt.rc('xtick',labelsize=15)
plt.rc('ytick',labelsize=15)
plt.suptitle('Nombre des matches sur chaque champ')
plt.show()
Ce dataset comprend une participation beaucoup plus importante pour la Surface "Hard". Aurait il une influence le type de surface pour chaque jouer? (a une prochaine analyse)
Animated graph to see the rank progression of Federer and Enqvist from 2000 to 2005!
# Group matches by winner and pull Federer's and Enqvist's winning matches.
win = df4.groupby("Winner")
fed = win.get_group('Federer R.')
esq = win.get_group("Enqvist T.")
#dos = win.get_group("Dosedel S.")
display(len(fed))
display(len(esq))
#display(len(dos))
#### FED & ESQ
# Matches Federer won on dates when Enqvist also won a match.
# FIX: the original grew the frame row-by-row with DataFrame.append (removed
# in pandas 2.0, and O(n^2)); a single boolean mask selects the same rows.
re = fed[fed["Date"].isin(esq["Date"])].copy()
#display(re)
re.head()
# Keep only Enqvist's wins whose dates also appear in Federer's subset.
# FIXES: the original `reEsq.set_index(...)` discarded its result (a no-op),
# and dropping rows one-by-one while iterating the membership Series is
# O(n^2); the boolean mask keeps exactly the same rows.
reEsq = esq[esq["Date"].isin(re["Date"])].copy()
#display(reEsq)
#reEsq.index
#reEsq.head()
#### FED & ESQ
# Sanity check: sizes of the two date-matched subsets.
display(len(reEsq))
display(len(re))
#### FED & ESQ
# Combine both players' rows and derive a day-month-year animation key.
Final = pd.concat([reEsq,re])
tim = Final.groupby("Date")
#tim.first()
Final['NewDate'] = pd.to_datetime(Final['Date']) ## parse the "Date" string into a real timestamp
Final['NewDate'] = Final['NewDate'].dt.strftime('%d-%m-%Y')
import plotly.express as px
# Animated bar chart: each player's rank (WRank) at each shared match date.
fig = px.bar(Final, x="Winner", y="WRank", color="Winner",
    animation_frame="NewDate", animation_group="Winner", range_y=[0,200])
fig.show()
Travail futur: